home *** CD-ROM | disk | FTP | other *** search
/ InterCD 2001 May / may_2001.iso / intercd / root / Multimedia / ^DivX_Article / virtualdub / VirtualDub-source-1_4d / mpeg_idct.cpp < prev    next >
Encoding:
C/C++ Source or Header  |  2001-03-20  |  36.6 KB  |  1,682 lines

  1. //    VirtualDub - Video processing and capture application
  2. //    Copyright (C) 1998-2001 Avery Lee
  3. //
  4. //    This program is free software; you can redistribute it and/or modify
  5. //    it under the terms of the GNU General Public License as published by
  6. //    the Free Software Foundation; either version 2 of the License, or
  7. //    (at your option) any later version.
  8. //
  9. //    This program is distributed in the hope that it will be useful,
  10. //    but WITHOUT ANY WARRANTY; without even the implied warranty of
  11. //    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12. //    GNU General Public License for more details.
  13. //
  14. //    You should have received a copy of the GNU General Public License
  15. //    along with this program; if not, write to the Free Software
  16. //    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  17.  
  18. ///////////////////////////////////////////////////////////////////////////
  19. //
  20. //                                 WARNING
  21. //
  22. // This code is heavily based off of the Java MPEG video player written by
  23. // Joerg Anders.  Because his code was released under the GNU GPL v2, this
  24. // means VirtualDub must also be released under GNU GPL v2 when MPEG
  25. // support is included.
  26. //
  27. // (Like that's any different.)
  28. //
  29. // This code is really nasty...
  30. //
  31. ///////////////////////////////////////////////////////////////////////////
  32.  
  33. #include <math.h>
  34.  
  35. #include "mpeg_idct.h"
  36. #include "cpuaccel.h"
  37.  
  38. #pragma warning(disable: 4799)        // function has no EMMS instruction
      // (The MMX paths in the naked assembly routines below return without EMMS,
      // which is what triggers warning C4799.)
  39.  
      // Byte lookup table defined in another translation unit.  The routines below
      // always index it with a +256 bias (YUV_clip_table + value + 256), so signed
      // intermediate values can be used as indices.
      // NOTE(review): presumably it saturates to 0..255 - confirm at its definition.
  40. extern "C" unsigned char YUV_clip_table[];
  41.  
  42. ///////////////////////////////////////////////////////////////////////////
  43.  
  44. // enable this define for CPU profiling
  45.  
  46. //#define PROFILE
  47.  
      // Cycle count treated as one second by the profiling code: it is the length of
      // a reporting interval and the divisor used to turn cycles into a CPU share.
      // NOTE(review): 200000000 presumably assumes a 200 MHz CPU - confirm.
  48. #define    CYCLES_PER_SECOND        (200000000)
  49.  
  50. ///////////////////////////////////////////////////////////////////////////
  51.  
      // When defined, the IDCT assembly multiplies by its constants with IMUL
      // instead of the shift/LEA sequences in the #else branches.
  52. //#define FAST_IMUL
  53.  
  #ifdef PROFILE
      // Profiling state.  The assembly routines in this file store the low 32 bits
      // of RDTSC into these variables, so every value is a CPU cycle count.
      static long profile_start;              // TSC sample taken on entry to the routine being timed
      static long profile_last;               // TSC value at which the current reporting interval began
      static long profile_fastput_cycles;     // cycles spent in IDCT_fast_put during this interval
      static long profile_fastadd_cycles;     // cycles spent in IDCT_fast_add during this interval
      static long profile_slow_cycles;        // cycles spent in the slow path (presumably the full IDCT)
      static long profile_fastputs;           // number of IDCT_fast_put calls during this interval
      static long profile_fastadds;           // number of IDCT_fast_add calls during this interval
      static long profile_slows;              // number of slow-path calls during this interval

      #include <stdio.h>
      #include <windows.h>

      // Debug-output helper with C linkage.  The argument is always a narrow
      // string, so the ANSI entry point is called explicitly; the TCHAR macro
      // would select the wide version in a UNICODE build.
      extern "C" void ODS(const char *s) { OutputDebugStringA(s); }

      // Emits one line of statistics for the interval that just ended (call counts,
      // average cycles per call, and the share of CPU time in tenths of a percent),
      // then advances profile_last by one interval and clears all counters.
      // Called from the exit paths of the assembly routines once at least
      // CYCLES_PER_SECOND cycles have passed since profile_last.
      void profile_update() {
          // Worst case is nine 20-character integers plus the fixed text, which
          // stays below 256 bytes.
          char buf[256];

          long total_cycles = profile_fastput_cycles + profile_fastadd_cycles + profile_slow_cycles;

          // CYCLES_PER_SECOND/1000 cycles is 0.1% of an interval, so pcnt is in
          // tenths of a percent.
          long pcnt = total_cycles / (CYCLES_PER_SECOND/1000);

          long put_avg  = profile_fastputs ? profile_fastput_cycles / profile_fastputs : 0L;
          long add_avg  = profile_fastadds ? profile_fastadd_cycles / profile_fastadds : 0L;
          long slow_avg = profile_slows    ? profile_slow_cycles    / profile_slows    : 0L;

          // Every argument is a long, so every conversion uses %ld.  The previous
          // format mixed %d and %c with long arguments, which is undefined
          // wherever long and int differ in size.
          sprintf(buf, "%ld fast puts (%ld/idct), %ld fast adds (%ld/idct), %ld slows (%ld/idct), total %3ld.%ld%% of CPU\n"
              ,profile_fastputs
              ,put_avg
              ,profile_fastadds
              ,add_avg
              ,profile_slows
              ,slow_avg
              ,pcnt/10, pcnt%10
              );

          OutputDebugStringA(buf);

          profile_last += CYCLES_PER_SECOND;
          profile_fastputs = profile_fastadds = profile_slows = 0;
          profile_fastput_cycles = profile_fastadd_cycles = profile_slow_cycles = 0;
      }
  #endif
  90.  
  91.  
  // Multiplier constants named in the butterfly formulas of the IDCT below.
  // Every expansion is fully parenthesized so a constant can be used inside a
  // larger expression (C6*(a+b), -Q*x, ...) without precedence surprises.

  #ifdef BE_REALLY_SLOW

  // Floating-point forms.
  // NOTE(review): these expand the identifier pi, which is not defined in the
  // visible part of this file (only MATH_PI is) - confirm before enabling.

  #define C6            (2*sin(pi/8))
  #define C4C6        (2*sqrt(2)*sin(pi/8))
  #define C4            (sqrt(2))
  #define Q            (2*(cos(pi/8)-sin(pi/8)))
  #define C4Q            (2*sqrt(2)*(cos(pi/8)-sin(pi/8)))
  #define R            (2*(cos(pi/8)+sin(pi/8)))
  #define C4R            (2*sqrt(2)*(cos(pi/8)+sin(pi/8)))

  #else

  // Fixed-point forms: the real value times 2048, rounded to nearest.
  //
  // For some reason, adding +1 to C6 or -1 to R (but not both) is enough
  // to bring this IDCT into IEEE-1180 compliance.  C6+1 is the better
  // adjustment.

  #define C6            (((int)(0.7653668647094 * 2048.0 + 0.5))+1)
  #define C4C6        ((int)(1.082392200292  * 2048.0 + 0.5))
  #define C4            ((int)(1.414213562373  * 2048.0 + 0.5))
  #define Q            ((int)(1.08239220029   * 2048.0 + 0.5))
  #define C4Q            ((int)(1.53073372946   * 2048.0 + 0.5))
  #define R            ((int)(2.613125929753  * 2048.0 + 0.5))
  #define C4R            ((int)(3.69551813004   * 2048.0 + 0.5))

  //    C6        = 1568    (1567 rounded, +1 compliance adjustment)
  //    C4C6    = 2217
  //    C4        = 2896
  //    Q        = 2217
  //    C4Q        = 3135
  //    R        = 5352
  //    C4R        = 7568

  #endif
  125.  
  #define MATH_PI        (3.141592653589793238462643)

  // Basis tables for the single-coefficient fast paths.  data[c][k*8 + l] is the
  // contribution of coefficient c (column c % 8, row c / 8) to the pixel in
  // row k, column l, scaled by 256 and rounded.  Filled in by IDCT_init().
  static struct {
      int data[64][64];
  } idct_precomputed;

  // Coefficient workspace.  The assembly routines in this file read entries of
  // this array and clear them as they are consumed.
  int dct_coeff[64];

  // Scratch matrices referenced by the IDCT assembly.
  static int matr1[64], matr2[64];

  extern void MJPEG_IDCT(int *);

  // Fills idct_precomputed.  For coefficient index i, with x = i % 8 and
  // y = i / 8, each entry is
  //
  //     round(d * cos(pi * y * (2k+1) / 16) * cos(pi * x * (2l+1) / 16))
  //
  // where d = 256/4 in general, 256/8 for the DC term (x == y == 0), and
  // 256 * sqrt(2) / 8 when exactly one of x and y is zero.
  //
  // Must be called before IDCT_fast_put() or IDCT_fast_add() is used with a
  // non-zero coefficient index, since those routines read the table.
  void IDCT_init() {
      for (int i = 0; i < 64; i++) {
          const int x = i % 8;
          const int y = i / 8;
          double d;

          // sqrt(2.0) rather than sqrt(2): an int argument is an ambiguous
          // overload on compilers whose <math.h> also declares float and
          // long double versions.
          if (!x && !y)
              d = 0.125 * 256.0;
          else if (!x || !y)
              d = 0.125 * sqrt(2.0) * 256.0;
          else
              d = 0.25 * 256.0;

          for (int k = 0; k < 8; k++) {
              for (int l = 0; l < 8; l++) {
                  idct_precomputed.data[i][k * 8 + l] = (int)floor(0.5 + d * cos(MATH_PI * y * (2*k+1) / 16.0) * cos(MATH_PI * x * (2*l+1) / 16.0));
              }
          }
  #if 0
          // Disabled self-test: runs the full IDCT on a block holding only
          // coefficient i and breaks into the debugger when the result differs
          // from the table by more than one level.
          unsigned char out[64];

          for (int k = 0; k < 64; k++)
              dct_coeff[k] = 0;

          dct_coeff[i] = 4;

          IDCT_norm(dct_coeff);

          dct_coeff[0] += 262144 + 1024;

          IDCT(out, 8, true);

          if (i)
              for (int k = 0; k < 64; k++)
                  if (abs(((int)out[k] - 128) - idct_precomputed.data[i][k]/4)>1)
                      __asm int 3
  #endif
      }
  }
  176.  
  177. void __declspec(naked) IDCT_fast_put(int pos, void *dst, long pitch) {
           // IDCT_fast_put: expand ONE coefficient of dct_coeff[] into an 8x8 block of
           // bytes, overwriting the destination.
           //
           //   pos   - index into dct_coeff[] of the coefficient to expand; that entry is
           //           read and then cleared.  pos == 0 selects the DC path.
           //   dst   - address of the top-left byte of the 8x8 destination block.
           //   pitch - byte offset from one destination row to the next.
           //
           // DC path (pos == 0): every pixel becomes dct_coeff[0] >> 11, forced to 0 when
           // negative and to 255 when above 255.
           // AC path (pos != 0): pixel n becomes
           //     (coeff * idct_precomputed.data[pos][n] + (dct_coeff[0] << 8)) >> 19
           // reduced to a byte (YUV_clip_table lookup in the scalar loop, saturating packs
           // in the MMX loop); dct_coeff[0] is cleared as well.
           //
           // Bit 0 of MMX_enabled (defined outside this section) selects the MMX loops.
           // No EMMS is issued here.  NOTE(review): callers presumably execute EMMS before
           // any x87 code runs - confirm.
           //
           // The function is naked: it preserves esi/edi/ebp/ebx by hand, reads its
           // arguments relative to esp (+16 for the four pushes) and ends with a plain ret.
  178.     __asm {
  179.         push    esi
  180.         push    edi
  181.         push    ebp
  182.         push    ebx
  183.  
  184. #ifdef PROFILE
  185.         rdtsc
  186.         mov        profile_start, eax
  187. #endif
  188.  
               // esi = pos (the trailing note on the next line says eax, but the load
               // targets esi); eax = dct_coeff[pos], whose slot is then zeroed via ecx.
  189.         mov        esi,[esp+4+16]                ;eax = coefficient #
  190.         xor        ecx,ecx
  191.         mov        eax,[dct_coeff + esi*4]        ;ecx = coefficient
  192.         mov        [dct_coeff + esi*4],ecx
  193.         mov        edi,[esp+8+16]                ;edi = dest
  194.  
  195.         or        esi,esi
  196.         jnz        AC_coeff
  197.  
  198.         ;we're doing the DC coefficient... easy!
  199.  
               // eax = dc >> 11.  The mov between sar and jns does not alter flags, so
               // jns still tests the sign produced by sar.  ecx receives pitch here.
  200.         sar        eax,11
  201.         mov        ecx,[esp+12+16]                ;esi = pitch
  202.         jns        DC_above_zero
  203.         xor        eax,eax
  204. DC_above_zero:
  205.         cmp        eax,256
  206.         jb        DC_below_255
  207.         mov        eax,255
  208. DC_below_255:
  209.  
  210.         test    MMX_enabled,1
  211.         jnz        DC_with_MMX
  212.  
               // Scalar fill: replicate the byte into all four bytes of eax, then write
               // two 8-byte rows per iteration, four iterations.
  213.         mov        ebx,eax
  214.         mov        edx,4
  215.         shl        eax,8
  216.         or        ebx,eax
  217.         mov        eax,ebx
  218.         shl        eax,16
  219.         or        eax,ebx
  220.  
  221. DC_loop:
  222.         mov        [edi+0],eax
  223.         mov        [edi+4],eax
  224.         mov        [edi+ecx+0],eax
  225.         mov        [edi+ecx+4],eax
  226.         lea        edi,[edi+ecx*2]
  227.         dec        edx
  228.         jne        DC_loop
  229.         jmp        short fnexit
  230.  
  231.  
               // MMX fill: replicate the byte into all eight bytes of mm6, then store
               // two rows per iteration, four iterations.
  232. DC_with_MMX:
  233.         movd    mm6,eax
  234.         punpcklbw    mm6,mm6
  235.         punpcklwd    mm6,mm6
  236.         mov            edx,4
  237.         punpckldq    mm6,mm6
  238.  
  239. DC_loop_MMX:
  240.         movq        [edi],mm6
  241.         movq        [edi+ecx],mm6
  242.         lea            edi,[edi+ecx*2]
  243.         dec            edx
  244.         jne            DC_loop_MMX
  245.         jmp            short fnexit
  246.  
  247.  
  248.         ;AC coefficient... we have to scale the table.  Damn.
  249.         ;
  250.         ;    eax = AC coefficient
  251.         ;    ecx = DC coefficient
  252.         ;    edx = V counter
  253.         ;    esi    = table pointer
  254.         ;    ebp = H counter
  255.  
               // esi = &idct_precomputed.data[pos][0] (pos * 256 bytes = 64 ints per row
               // of the table).  ecx = dct_coeff[0] << 8, and dct_coeff[0] is cleared.
  256. AC_coeff:
  257.         shl        esi,8
  258.         mov        ecx,dct_coeff
  259.         add        esi,offset idct_precomputed
  260.         mov        dword ptr [dct_coeff],0
  261.         shl        ecx,8
  262.         mov        edx,8
  263.  
  264.         test    MMX_enabled,1
  265.         jnz        AC_with_MMX
  266.  
               // Scalar AC loop: 8 rows of 8 pixels, one table entry per pixel.
  267. AC_loop_vert:
  268.         mov        ebp,8
  269. AC_loop_horiz:
  270.         mov        ebx,eax
  271.         imul    ebx,[esi]
  272.         add        ebx,ecx
  273.         sar        ebx,19
  274.         mov        bl,[YUV_clip_table+ebx+256]
  275.         mov        [edi],bl
  276.         add        esi,4
  277.         inc        edi
  278.         dec        ebp
  279.         jne        AC_loop_horiz
  280.  
               // Step to the next row: edi already advanced by 8, so add pitch - 8.
               // lea does not touch the flags, so jne still tests the result of dec edx.
  281.         mov        ebx,[esp+12+16]
  282.         dec        edx
  283.         lea        edi,[edi+ebx-8]
  284.         jne        AC_loop_vert
  285.  
               // Common exit: optional profiling, then restore the saved registers.
  286. fnexit:
  287. #ifdef PROFILE
               // eax = cycles spent in this call; ebx = cycles since profile_last.  When
               // ebx reaches CYCLES_PER_SECOND (unsigned compare) a report is emitted.
  288.         rdtsc
  289.         mov        ebx,eax
  290.         sub        eax,profile_start
  291.         inc        profile_fastputs
  292.         add        profile_fastput_cycles,eax
  293.         sub        ebx,profile_last
  294.         cmp        ebx,CYCLES_PER_SECOND
  295.         jb        notsec
  296.         call    profile_update
  297. notsec:
  298. #endif
  299.         pop        ebx
  300.         pop        ebp
  301.         pop        edi
  302.         pop        esi
  303.         ret
  304.  
               // MMX AC path.  pmaddwd multiplies 16-bit words, so the 32-bit coefficient
               // is split in two:
               //   mm4 = ((coeff & 7fff8000h) >> 15) + (1 if coeff is negative), both dwords
               //   mm5 = (coeff & 7fffh) with the sign bit of coeff moved to bit 15
               // The mm4 product is shifted left by 15 and added to the mm5 product.
               // mm6 holds dct_coeff[0] << 8 in both dwords.
  305. AC_with_MMX:
  306.         movd    mm6,ecx
  307.         pxor    mm7,mm7
  308.         mov        ebx,eax
  309.         and        eax,7fff8000h
  310.         shr        eax,15
  311.         mov        ebp,ebx
  312.         sar        ebp,31
  313.         sub        eax,ebp
  314.         movd    mm4,eax
  315.         punpckldq    mm6,mm6
  316.         mov        eax,ebx
  317.         and        ebx,80000000h
  318.         shr        ebx,16
  319.         and        eax,00007fffh
  320.         or        eax,ebx
  321.         punpckldq    mm4,mm4
  322.         movd    mm5,eax
  323.         punpckldq    mm5,mm5
  324.  
               // One destination row (8 table entries, 32 bytes) per iteration.
               // First half: entries 0..3, packed to four words in mm1.
  325. AC_loop_vert_MMX:
  326.         movq    mm0,[esi]            ;AC pattern #0, #1
  327.         movq    mm2,[esi+8]            ;AC pattern #2, #3
  328.         movq    mm1,mm0
  329.         movq    mm3,mm2
  330.         pmaddwd    mm0,mm4
  331.         pmaddwd    mm2,mm4
  332.         add        esi,32
  333.         pmaddwd    mm1,mm5
  334.         pmaddwd    mm3,mm5
  335.         pslld    mm0,15
  336.         pslld    mm2,15
  337.         paddd    mm1,mm0
  338.         paddd    mm3,mm2
  339.         paddd    mm1,mm6
  340.         paddd    mm3,mm6
  341.         psrad    mm1,19
  342.         psrad    mm3,19
  343.         packssdw    mm1,mm3
  344.  
               // Second half: entries 4..7 (esi was already advanced by 32), packed to
               // four words in mm0; packuswb then saturates all eight to unsigned bytes.
  345.         movq    mm0,[esi-16]        ;AC pattern #4, #5
  346.         movq    mm2,[esi-8]            ;AC pattern #6, #7
  347.         movq    mm7,mm0
  348.         movq    mm3,mm2
  349.         pmaddwd    mm0,mm4
  350.         pmaddwd    mm2,mm4
  351.         pmaddwd    mm7,mm5
  352.         pmaddwd    mm3,mm5
  353.         pslld    mm0,15
  354.         pslld    mm2,15
  355.         paddd    mm0,mm7
  356.         paddd    mm2,mm3
  357.         paddd    mm0,mm6
  358.         paddd    mm2,mm6
  359.         psrad    mm0,19
  360.         psrad    mm2,19
  361.         packssdw    mm0,mm2
  362.         packuswb    mm1,mm0
  363.  
               // ebx = pitch.  The flags from dec edx survive movq and lea, so jne below
               // tests the row counter.
  364.         mov        ebx,[esp+12+16]
  365.         dec        edx
  366.  
  367.         movq    [edi],mm1
  368.  
  369.         lea        edi,[edi+ebx]
  370.         jne        AC_loop_vert_MMX
  371.  
  372.         jmp        short fnexit
  373.     }
  374. }
  375.  
  376. void __declspec(naked) IDCT_fast_add(int pos, void *dst, long pitch) {
           // IDCT_fast_add: expand ONE coefficient of dct_coeff[] and ADD the result to
           // the bytes already present in an 8x8 destination block.
           //
           //   pos   - index into dct_coeff[] of the coefficient to expand; that entry is
           //           read and then cleared.  pos == 0 selects the DC path.
           //   dst   - address of the top-left byte of the 8x8 destination block.
           //   pitch - byte offset from one destination row to the next.
           //
           // DC path (pos == 0): dct_coeff[0] >> 11 is added to every pixel.
           // AC path (pos != 0): pixel n gains
           //     (coeff * idct_precomputed.data[pos][n] + (dct_coeff[0] << 8)) >> 19
           // and dct_coeff[0] is cleared as well.
           // Sums are reduced to a byte through YUV_clip_table in the scalar loops and
           // through saturating MMX instructions in the MMX loops.
           //
           // Bit 0 of MMX_enabled (defined outside this section) selects the MMX loops.
           // No EMMS is issued here.  NOTE(review): callers presumably execute EMMS before
           // any x87 code runs - confirm.
           //
           // The function is naked: it preserves esi/edi/ebp/ebx by hand, reads its
           // arguments relative to esp (+16 for the four pushes) and ends with a plain ret.
  377.     __asm {
  378.         push    esi
  379.         push    edi
  380.         push    ebp
  381.         push    ebx
  382. #ifdef PROFILE
  383.         rdtsc
  384.         mov        profile_start, eax
  385. #endif
  386.  
               // esi = pos (the trailing note on the next line says eax, but the load
               // targets esi); eax = dct_coeff[pos], whose slot is then zeroed via ecx.
  387.         mov        esi,[esp+4+16]                ;eax = coefficient #
  388.         xor        ecx,ecx
  389.         mov        eax,[dct_coeff + esi*4]        ;ecx = coefficient
  390.         mov        edi,[esp+8+16]                ;edi = dest
  391.         mov        [dct_coeff + esi*4],ecx        ;clear coefficient :)
  392.  
  393.         or        esi,esi
  394.         jnz        AC_coeff
  395.  
  396.         ;we're doing the DC coefficient... easy!
  397.  
               // eax = dc >> 11 (signed), ebp = pitch.
  398.         sar        eax,11
  399.         mov        ebp,[esp+12+16]
  400.  
  401.         test    MMX_enabled,1
  402.         jnz        DC_with_MMX
  403.  
               // Scalar DC path.  ebp becomes YUV_clip_table + 256 + (dc >> 11), so
               // [ebp + pixel] is the table entry for pixel + dc.  pitch is parked on the
               // stack and read back as [esp].  eax..edx are zeroed so that each byte load
               // leaves a zero-extended pixel value to use as the index.
  404.         add        eax,offset YUV_clip_table+256
  405.         mov        esi,8
  406.  
  407.         push    ebp
  408.         mov        ebp,eax
  409.         xor        eax,eax
  410.         xor        ebx,ebx
  411.         xor        ecx,ecx
  412.         xor        edx,edx
               // One row of eight pixels per iteration, eight iterations.
  413. DC_loop:
  414.         mov        al,[edi+0]
  415.         mov        bl,[edi+4]
  416.         mov        cl,[edi+1]
  417.         mov        dl,[edi+5]
  418.         mov        al,[ebp+eax]
  419.         mov        bl,[ebp+ebx]
  420.         mov        cl,[ebp+ecx]
  421.         mov        dl,[ebp+edx]
  422.         mov        [edi+0],al
  423.         mov        [edi+4],bl
  424.         mov        [edi+1],cl
  425.         mov        [edi+5],dl
  426.  
  427.         mov        al,[edi+2]
  428.         mov        bl,[edi+6]
  429.         mov        cl,[edi+3]
  430.         mov        dl,[edi+7]
  431.         mov        al,[ebp+eax]
  432.         mov        bl,[ebp+ebx]
  433.         mov        cl,[ebp+ecx]
  434.         mov        dl,[ebp+edx]
  435.         mov        [edi+2],al
  436.         mov        [edi+6],bl
  437.         mov        [edi+3],cl
  438.         mov        [edi+7],dl
  439.  
  440.         add        edi,[esp]
  441.         dec        esi
  442.         jne        DC_loop
  443.         pop        ebp
  444.         jmp        short fnexit
  445.  
               // MMX DC path.  mm6 = low byte of (dc >> 11) replicated eight times and
               // mm7 = its byte-wise negation.  A non-negative value is added with unsigned
               // saturation (paddusb mm6); a negative one is applied by subtracting mm7
               // with unsigned saturation (psubusb).
               // NOTE(review): only the low byte is replicated, so this path appears to
               // assume the magnitude of dc >> 11 is below 256 - confirm.
  446. DC_with_MMX:
  447.         movd    mm6,eax
  448.         pxor        mm7,mm7
  449.         punpcklbw    mm6,mm6
  450.         punpcklwd    mm6,mm6
  451.         mov            edx,4
  452.         punpckldq    mm6,mm6
  453.         psubb        mm7,mm6
  454.         or            eax,eax
  455.         js            DC_loop_MMX_sub
  456.  
               // Two rows per iteration, four iterations.
  457. DC_loop_MMX_add:
  458.         movq        mm0,[edi]
  459.         movq        mm1,[edi+ebp]
  460.         paddusb        mm0,mm6
  461.         paddusb        mm1,mm6
  462.         movq        [edi],mm0
  463.         movq        [edi+ebp],mm1
  464.         lea            edi,[edi+ebp*2]
  465.         dec            edx
  466.         jne            DC_loop_MMX_add
  467.         jmp            short fnexit
  468.  
  469. DC_loop_MMX_sub:
  470.         movq        mm0,[edi]
  471.         movq        mm1,[edi+ebp]
  472.         psubusb        mm0,mm7
  473.         psubusb        mm1,mm7
  474.         movq        [edi],mm0
  475.         movq        [edi+ebp],mm1
  476.         lea            edi,[edi+ebp*2]
  477.         dec            edx
  478.         jne            DC_loop_MMX_sub
  479.         jmp            short fnexit
  480.  
  481.         ;AC coefficient... we have to scale the table.  Damn.
  482.         ;
  483.         ;    eax = AC coefficient
  484.         ;    ecx = DC coefficient
  485.         ;    edx = V counter
  486.         ;    esi    = table pointer
  487.         ;    ebp = H counter
  488.  
               // esi = &idct_precomputed.data[pos][0] (pos * 256 bytes = 64 ints per row
               // of the table).  ecx = dct_coeff[0] << 8, and dct_coeff[0] is cleared.
  489. AC_coeff:
  490.         shl        esi,8
  491.         mov        ecx,dct_coeff
  492.         add        esi,offset idct_precomputed
  493.         mov        dword ptr [dct_coeff],0
  494.         shl        ecx,8
  495.         mov        edx,8
  496.  
  497.         test    MMX_enabled,1
  498.         jnz        AC_with_MMX
  499.  
               // Scalar AC path.  eax is needed for the pixel value inside the loop, so
               // the coefficient is kept on the stack and reloaded from [esp] per pixel.
  500.         push    eax
  501.  
  502. AC_loop_vert:
  503.         mov        ebp,8
  504. AC_loop_horiz:
  505.         mov        ebx,[esp]
  506.         xor        eax,eax
  507.         imul    ebx,[esi]
  508.         add        ebx,ecx
  509.         sar        ebx,19
  510.         mov        al,[edi]
  511.         mov        bl,[YUV_clip_table+ebx+eax+256]
  512.         mov        [edi],bl
  513.         add        esi,4
  514.         inc        edi
  515.         dec        ebp
  516.         jne        AC_loop_horiz
  517.  
               // pitch sits 4 bytes further from esp than usual (+20, not +16) because of
               // the extra push above.  lea keeps the flags from dec edx for jne.
  518.         mov        ebx,[esp+12+20]
  519.         dec        edx
  520.         lea        edi,[edi+ebx-8]
  521.         jne        AC_loop_vert
  522.  
  523.         pop        eax
  524.  
               // Common exit: optional profiling, then restore the saved registers.
  525. fnexit:
  526. #ifdef PROFILE
               // eax = cycles spent in this call; ebx = cycles since profile_last.  When
               // ebx reaches CYCLES_PER_SECOND (unsigned compare) a report is emitted.
  527.         rdtsc
  528.         mov        ebx,eax
  529.         sub        eax,profile_start
  530.         inc        profile_fastadds
  531.         add        profile_fastadd_cycles,eax
  532.         sub        ebx,profile_last
  533.         cmp        ebx,CYCLES_PER_SECOND
  534.         jb        notsec
  535.         call    profile_update
  536. notsec:
  537. #endif
  538.         pop        ebx
  539.         pop        ebp
  540.         pop        edi
  541.         pop        esi
  542.         ret
  543.  
               // MMX AC path.  pmaddwd multiplies 16-bit words, so the 32-bit coefficient
               // is split in two:
               //   mm4 = ((coeff & 7fff8000h) >> 15) + (1 if coeff is negative), both dwords
               //   mm5 = (coeff & 7fffh) with the sign bit of coeff moved to bit 15
               // The mm4 product is shifted left by 15 and added to the mm5 product.
               // mm6 holds dct_coeff[0] << 8 in both dwords.
  544. AC_with_MMX:
  545.         movd    mm6,ecx
  546.         pxor    mm7,mm7
  547.         mov        ebx,eax
  548.         and        eax,7fff8000h
  549.         shr        eax,15
  550.         mov        ebp,ebx
  551.         sar        ebp,31
  552.         sub        eax,ebp
  553.         movd    mm4,eax
  554.         punpckldq    mm6,mm6
  555.         mov        eax,ebx
  556.         and        ebx,80000000h
  557.         shr        ebx,16
  558.         and        eax,00007fffh
  559.         or        eax,ebx
  560.         punpckldq    mm4,mm4
  561.         movd    mm5,eax
  562.         punpckldq    mm5,mm5
  563.  
  564.  
               // One destination row (8 table entries, 32 bytes) per iteration.
               // First half: entries 0..3 are computed as words in mm0, and the first four
               // destination bytes are widened to words (punpcklbw with zero) and added.
  565. AC_loop_vert_MMX:
  566.         movq    mm0,[esi]            ;AC pattern #0, #1
  567.         movq    mm2,[esi+8]            ;AC pattern #2, #3
  568.         movq    mm1,mm0
  569.         movq    mm3,mm2
  570.         pmaddwd    mm0,mm4
  571.         pmaddwd    mm2,mm4
  572.         add        esi,32
  573.         pmaddwd    mm1,mm5
  574.         pmaddwd    mm3,mm5
  575.         pslld    mm0,15
  576.         pslld    mm2,15
  577.         paddd    mm0,mm1
  578.         paddd    mm2,mm3
  579.         pxor    mm1,mm1
  580.         movd    mm3,[edi]
  581.         paddd    mm0,mm6
  582.         punpcklbw    mm3,mm1
  583.         paddd    mm2,mm6
               // NOTE: esi was already advanced by 32, so [esi-16] and [esi-8] below are
               // entries 4,5 and 6,7 of this row; the trailing notes on those two loads
               // (patterns 0,1 and 2,3) are stale.
  584.         movq    mm7,[esi-16]        ;AC pattern #0, #1
  585.         psrad    mm0,19
  586.         psrad    mm2,19
  587.         packssdw    mm0,mm2
  588.         movq    mm2,[esi-8]            ;AC pattern #2, #3
  589.         paddw        mm0,mm3
  590.  
               // Second half: entries 4..7 as words in mm7, plus destination bytes 4..7;
               // packuswb then saturates all eight sums to unsigned bytes.
  591.         movq    mm1,mm7
  592.         movq    mm3,mm2
  593.         pmaddwd    mm7,mm4
  594.         pmaddwd    mm2,mm4
  595.         pmaddwd    mm1,mm5
  596.         pmaddwd    mm3,mm5
  597.         pslld    mm7,15
  598.         pslld    mm2,15
  599.         paddd    mm7,mm1
  600.         paddd    mm2,mm3
  601.         pxor    mm1,mm1
  602.         movd    mm3,[edi+4]
  603.         paddd    mm7,mm6
  604.         punpcklbw    mm3,mm1
  605.         paddd    mm2,mm6
  606.         psrad    mm7,19
  607.         psrad    mm2,19
  608.         packssdw    mm7,mm2
  609.         paddw        mm7,mm3
  610.         packuswb    mm0,mm7
  611.  
  612.         movq    [edi],mm0
  613.  
               // ebx = pitch; lea keeps the flags from dec edx for the jne below.
  614.         mov        ebx,[esp+12+16]
  615.         dec        edx
  616.         lea        edi,[edi+ebx]
  617.         jne        AC_loop_vert_MMX
  618.  
  619.         jmp        short fnexit
  620.     }
  621. }
  622.  
  623. #define coeff dct_coeff        // alias: the IDCT assembly further down addresses dct_coeff[] through this name
  624.  
  // Rescales an 8x8 block of values in place.
  //
  // For the entry in column c, row r (index r*8 + c) the value v is replaced by
  //
  //     floor(2048 * 16 * (v / s) * cos(c * pi/16) * cos(r * pi/16) + 0.5)
  //
  // where s is 8 for the top-left entry, 4*sqrt(2) for the remaining entries of
  // the first row and first column, and 4 everywhere else.
  //
  //   m1 - pointer to 64 ints in row-major order; overwritten with the result.
  void IDCT_norm(int *m1) {
      for (int idx = 0; idx < 64; idx++) {
          const int col = idx % 8;
          const int row = idx / 8;

          // Normalisation divisor selected by position in the block.
          double divisor;
          if (col == 0 && row == 0)
              divisor = 8.0;
          else if (col == 0 || row == 0)
              divisor = 4.0 * sqrt(2.0);
          else
              divisor = 4.0;

          const double scaled = (double)m1[idx] / divisor;

          m1[idx] = (int)floor(2048.0 * 16.0 * scaled * cos(col * (3.1415926535 / 16.0)) * cos(row * (3.1415926535 / 16.0)) + 0.5);
      }
  }
  645.  
  646. ///////////////////////////////////////////////////////////////////////////
  647.  
  648. void __declspec(naked) IDCT(void *dst, long modulo, int intra) {
  649.  
  650.     __asm {
  651.         push    esi
  652.         push    edi
  653.         push    ebp
  654.         push    ebx
  655.  
  656. #ifdef PROFILE
  657.         rdtsc
  658.         mov        profile_start, eax
  659. #endif
  660.  
  661.     // compute B1 (horizontal / vertical algoritm):
  662.     // (the vertical part is in tensor product)
  663.     //
  664.     // <0> = [0]
  665.     // <1> = [4]
  666.     // <2> = [2] - [6]
  667.     // <3> = [2] + [6]
  668.     // <4> = [5] - [3]
  669.     // <5> = [1] + [7] - ([5] + [3])
  670.     // <6> = [1] - [7]
  671.     // <7> = [1] + [7] + ([5] + [3])
  672.  
  673.         mov        ebp,7*4
  674.         mov        edi,0
  675.  
  676.     idct_B1_loop:
  677.         mov        eax,[coeff+ebp*8+1*4]        ;eax = [1]
  678.         mov        ebx,[coeff+ebp*8+7*4]        ;ebx = [7]
  679.  
  680.         mov        [coeff+ebp*8+1*4],edi
  681.         mov        [coeff+ebp*8+7*4],edi
  682.  
  683.         mov        edx,[coeff+ebp*8+5*4]        ;edx = [5]
  684.         mov        esi,[coeff+ebp*8+3*4]        ;esi = [3]
  685.  
  686.         mov        [coeff+ebp*8+5*4],edi
  687.         mov        [coeff+ebp*8+3*4],edi
  688.  
  689.         lea        ecx,[eax+ebx]                ;ecx = [1] + [7]
  690.         sub        eax,ebx                        ;eax = [1] - [7]
  691.  
  692.         lea        ebx,[edx+esi]                ;ebx = [5] + [3]
  693.         sub        edx,esi                        ;edx = [5] - [3]
  694.  
  695.         mov        [matr1+ebp+4*32],edx
  696.         mov        esi,[coeff+ebp*8+2*4]        ;esi = [2]
  697.  
  698.         lea        edx,[ecx+ebx]                ;edx = ([1] + [7]) + ([5] + [3])
  699.         mov        [matr1+ebp+6*32],eax
  700.  
  701.         sub        ecx,ebx                        ;ecx = ([1] + [7]) - ([5] + [3])
  702.         mov        ebx,[coeff+ebp*8+6*4]        ;ebx = [6]
  703.  
  704.         mov        [coeff+ebp*8+2*4],edi
  705.         mov        [coeff+ebp*8+6*4],edi
  706.  
  707.         mov        [matr1+ebp+7*32],edx
  708.         mov        edx,[coeff+ebp*8+0*4]        ;edx = [0]
  709.  
  710.         lea        eax,[esi+ebx]                ;eax = [2] + [6]
  711.         mov        [matr1+ebp+5*32],ecx
  712.  
  713.         mov        [matr1+ebp+3*32],eax
  714.         sub        esi,ebx                        ;esi = [2] - [6]
  715.  
  716.         mov        ecx,[coeff+ebp*8+4*4]        ;ecx = [4]
  717.         mov        [matr1+ebp+0*32],edx
  718.  
  719.         mov        [matr1+ebp+2*32],esi
  720.         mov        [coeff+ebp*8+0*4],edi
  721.  
  722.         mov        [coeff+ebp*8+4*4],edi
  723.         mov        [matr1+ebp+1*32],ecx
  724.  
  725.         sub        ebp,4
  726.         jnc        idct_B1_loop
  727.  
  728.     // compute the vertical part and the (tensor product) M:
  729.  
  730.         xor        ebp,ebp
  731.         call    IDCT_vert_0137
  732.         mov        ebp,1*8*4
  733.         call    IDCT_vert_0137
  734.         mov        ebp,3*8*4
  735.         call    IDCT_vert_0137
  736.         mov        ebp,7*8*4
  737.         call    IDCT_vert_0137
  738.         call    IDCT_vert_25
  739.         call    IDCT_vert_46
  740.  
  741.     //    co35=matr1[5+32]-matr1[3+32];
  742.     //    co17=matr1[1+48]-matr1[7+48];
  743.     //    l2 = co35+co17;
  744.     //    l0 = co35-co17;
  745.  
  746.     //    co35=matr1[5+48]-matr1[3+48];
  747.     //    co17=matr1[1+32]-matr1[7+32];
  748.     //    l1 = co35+co17;
  749.     //    l3 = co35-co17; 
  750.  
  751.         mov        ecx,matr1[(5+32)*4]
  752.         mov        esi,matr1[(1+48)*4]
  753.         sub        ecx,matr1[(3+32)*4]        ;ecx = co35
  754.         sub        esi,matr1[(7+48)*4]        ;eax = co17
  755.         mov        edi,matr1[(5+48)*4]
  756.         mov        ebx,matr1[(1+32)*4]
  757.         sub        edi,matr1[(3+48)*4]        ;edi = co35
  758.         sub        ebx,matr1[(7+32)*4]        ;ebx = co17
  759.  
  760.         mov        eax,ecx
  761.         mov        edx,edi
  762.         add        ecx,esi                    ;ecx = l2 = co35+co17
  763.         add        edi,ebx                    ;edi = l1 = co35+co17
  764.         sub        eax,esi                    ;eax = l0 = co35-co17
  765.         sub        edx,ebx                    ;edx = l3 = co35-co17
  766.  
  767.     //    g0 = C4*(l0+l1);
  768.     //    g1 = C4*(l0-l1);
  769.     //    g2 = l2<<12;
  770.     //    g3 = l3<<12;
  771.  
  772.         shl        ecx,12                    ;ecx = g2 = l2<<12
  773.         mov        ebx,eax
  774.         shl        edx,12                    ;edx = g3 = l3<<12
  775.         add        eax,edi                    ;eax = l0+l1    (a)
  776.         sub        ebx,edi                    ;ebx = l0-l1    (b)
  777.         nop
  778.  
  779. #ifdef FAST_IMUL
  780.         imul    eax,2896
  781.         imul    ebx,2896
  782. #else
  783.         shl        ebx,4                    ;b/1
  784.         lea        esi,[eax*8+eax]            ;a/1
  785.         lea        edi,[ebx*8+ebx]            ;b/2
  786.         lea        esi,[esi*4+esi]            ;a/2
  787.         lea        edi,[edi*4+edi]            ;b/3
  788.         lea        eax,[esi*4+eax]            ;a/3
  789.         shl        eax,4                    ;a/4    eax = g0 = C4*(l0+l1)
  790.         lea        ebx,[edi*4+ebx]            ;b/4    ebx = g1 = C4*(l0-l1)
  791. #endif
  792.  
  793.         ;    eax = g0
  794.         ;    ebx = g1
  795.         ;    ecx = g2
  796.         ;    edx = g3
  797.  
  798.     //    matr2[38] = g1+g3;
  799.     //    matr2[52] = g1-g3;
  800.     //    matr2[36] = g2+g0;
  801.     //    matr2[54] = g2-g0;
  802.  
  803.         mov        esi,ebx
  804.         mov        edi,ecx
  805.         add        ebx,edx                    ;ebx = g1+g3
  806.         add        ecx,eax                    ;ecx = g2+g0
  807.         sub        esi,edx                    ;esi = g1-g3
  808.         sub        edi,eax                    ;edi = g2-g0
  809.  
  810.         mov        matr2[(6+32)*4],ebx
  811.         mov        matr2[(4+48)*4],esi
  812.         mov        matr2[(4+32)*4],ecx
  813.         mov        matr2[(6+48)*4],edi
  814.  
  815.     //    tmp = C6*(matr2[32]+matr2[48]);
  816.     //    matr2[32] = -Q*matr2[32]-tmp;
  817.     //    matr2[48] =  R*matr2[48]-tmp;
  818.  
  819.     //    tmp = C6*(matr2[33] + matr2[49]);
  820.     //    matr2[33] = -Q*matr2[33]-tmp;
  821.     //    matr2[49] =  R*matr2[49]-tmp;
  822.  
  823.     //    tmp = C4C6 * (matr2[34] + matr2[50]);
  824.     //    matr2[34] = -C4Q*matr2[34]-tmp;
  825.     //    matr2[50] =  C4R*matr2[50]-tmp;
  826.  
  827.     //    tmp = C6*(matr2[35] + matr2[51]);
  828.     //    matr2[35] = -Q*matr2[35]-tmp;
  829.     //    matr2[51] =  R*matr2[51]-tmp;
  830.  
  831.     //    tmp = C4C6 * (matr2[37] + matr2[53]);
  832.     //    matr2[37] = -C4Q*matr2[37]-tmp;
  833.     //    matr2[53] =  C4R*matr2[53]-tmp;
  834.  
  835.     //    tmp = C6*(matr2[39] + matr2[55]);
  836.     //    matr2[39] = -Q*matr2[39]-tmp;
  837.     //    matr2[55] =  R*matr2[55]-tmp;
  838.  
  839.         ;--- 0,1
  840.  
  841.         mov        eax,matr2[32*4]        ;(1)
  842.         mov        ebx,matr2[33*4]        ;(2)
  843.         mov        ecx,matr2[48*4]        ;(1)
  844.         mov        edx,matr2[49*4]        ;(2)
  845.  
  846.         add        ecx,eax                ;u (1)
  847.         add        edx,ebx                ;v (2)
  848.  
  849.         mov        eax,matr2[32*4]        ;u (1)
  850.         mov        ebx,matr2[48*4]        ;v (1)
  851.  
  852. #ifdef FAST_IMUL
  853.         imul    ecx,1567+1
  854.         imul    eax,2217
  855.         imul    ebx,5352
  856. #else
  857.         lea        ebp,[ecx+ecx*2]        ;u (1c/1) ecx = C6*(matr2[32] + matr2[48])
  858.         nop                            ;v
  859.         lea        esi,[eax+eax]        ;u (1a/1) Q*matr2[32]
  860.         lea        edi,[ebx*4+ebx]        ;v (1b/1) R*matr2[48]
  861.         shl        edi,3                ;u (1b/2)
  862.         nop                            ;v
  863.         shl        ebp,09H                ;u (1c/2)
  864.         lea        esi,[esi*8+eax]        ;v (1a/2)
  865.         shl        ecx,05H                ;u (1c/3)
  866.         lea        ebx,[edi*4+ebx]        ;v (1b/3)
  867.         lea        esi,[esi*4+eax]        ;u (1a/3)
  868.         add        ecx,ebp                ;v (1c/4)
  869.         nop                            ;u
  870.         lea        ebx,[ebx*8+edi]        ;v (1b/4)
  871.         lea        esi,[esi*4+eax]        ;u (1a/4)
  872.         nop                            ;v
  873.  
  874.         lea        ebx,[ebx*4+edi]        ;u (1b/5)
  875.         nop                            ;v
  876.         lea        eax,[esi*8+eax]        ;u (1a/5)
  877. #endif
  878.         sub        ebx,ecx                ;v (1) R*matr2[48]-tmp
  879.         add        ecx,eax                ;u (1) Q*matr2[32]+tmp
  880.         xor        ecx,-1                ;u (1)
  881.         mov        matr2[48*4],ebx        ;v (1)
  882.         inc        ecx                    ;u (1)
  883.         mov        eax,matr2[33*4]        ;v (2)
  884.         mov        matr2[32*4],ecx        ;u (1)
  885.         mov        ebx,matr2[49*4]        ;v (2)
  886.  
  887. #ifdef FAST_IMUL
  888.         imul    edx,1567+1
  889.         imul    eax,2217
  890.         imul    ebx,5352
  891. #else
  892.         lea        ebp,[edx+edx*2]        ;u (2c/1) edx = C6*(matr2[33] + matr2[49])
  893.         nop                            ;v
  894.         lea        esi,[eax+eax]        ;u (2a/1) Q*matr2[33]
  895.         lea        edi,[ebx*4+ebx]        ;v (2b/1) R*matr2[49]
  896.         shl        edi,3                ;u (2b/2)
  897.         nop                            ;v
  898.         shl        ebp,09H                ;u (2c/2)
  899.         lea        esi,[esi*8+eax]        ;v (2a/2)
  900.         nop                            ;u
  901.         lea        ebx,[edi*4+ebx]        ;v (2b/3)
  902.         lea        esi,[esi*4+eax]        ;u (2a/3)
  903.         nop                            ;v
  904.         shl        edx,05H                ;u (2c/3)
  905.         lea        ebx,[ebx*8+edi]        ;v (2b/4)
  906.         lea        esi,[esi*4+eax]        ;u (2a/4)
  907.         add        edx,ebp                ;v (2c/4)
  908.  
  909.         lea        ebx,[ebx*4+edi]        ;u (2b/5)
  910.         nop                            ;v
  911.         lea        eax,[esi*8+eax]        ;u (2a/5)
  912. #endif
  913.         sub        ebx,edx                ;v (2) R*matr2[49]-tmp
  914.         add        edx,eax                ;u (2) Q*matr2[33]+tmp
  915.         xor        edx,-1                ;u (2)
  916.         mov        matr2[49*4],ebx        ;v (2)
  917.         inc        edx                    ;u (2)
  918.         mov        eax,matr2[35*4]        ;v (1) [3,7]
  919.         mov        matr2[33*4],edx        ;u (2)
  920.         mov        ebx,matr2[39*4]        ;v (2) [3,7]
  921.  
  922.         ;--- 3, 7
  923.  
  924.         mov        ecx,matr2[51*4]        ;(1)
  925.         mov        edx,matr2[55*4]        ;(2)
  926.  
  927.         add        ecx,eax                ;(1)
  928.         add        edx,ebx                ;(2)
  929.  
  930.         mov        eax,matr2[35*4]        ;(1)
  931.         mov        ebx,matr2[51*4]        ;(2)
  932.  
  933. #ifdef FAST_IMUL
  934.         imul    ecx,1567+1
  935.         imul    eax,2217
  936.         imul    ebx,5352
  937. #else
  938.         lea        ebp,[ecx+ecx*2]        ;u (1c/1) ecx = C6*(matr2[32] + matr2[48])
  939.         nop                            ;v
  940.         lea        esi,[eax+eax]        ;u (1a/1) Q*matr2[32]
  941.         lea        edi,[ebx*4+ebx]        ;v (1b/1) R*matr2[48]
  942.         shl        edi,3                ;u (1b/2)
  943.         nop                            ;v
  944.         shl        ebp,09H                ;u (1c/2)
  945.         lea        esi,[esi*8+eax]        ;v (1a/2)
  946.         nop                            ;u
  947.         lea        ebx,[edi*4+ebx]        ;v (1b/3)
  948.         lea        esi,[esi*4+eax]        ;u (1a/3)
  949.         nop
  950.         shl        ecx,05H                ;u (1c/3)
  951.         lea        ebx,[ebx*8+edi]        ;v (1b/4)
  952.         lea        esi,[esi*4+eax]        ;u (1a/4)
  953.         add        ecx,ebp                ;v (1c/4)
  954.         lea        ebx,[ebx*4+edi]        ;u (1b/5)
  955.         nop                            ;v
  956.         lea        eax,[esi*8+eax]        ;u (1a/5)
  957.  
  958. #endif
  959.  
  960.         sub        ebx,ecx                ;u (1) R*matr2[48]-tmp
  961.         add        ecx,eax                ;v (1) Q*matr2[32]+tmp
  962.         xor        ecx,-1                ;u (1)
  963.         mov        matr2[51*4],ebx        ;v (1)
  964.         inc        ecx                    ;u (1)
  965.         mov        eax,matr2[39*4]        ;v (2)
  966.         mov        matr2[35*4],ecx        ;u (1)
  967.         mov        ebx,matr2[55*4]        ;v (2)
  968.  
  969. #ifdef FAST_IMUL
  970.         imul    edx,1567+1
  971.         imul    eax,2217
  972.         imul    ebx,5352
  973. #else
  974.         lea        ebp,[edx+edx*2]        ;u (2c/1) edx = C6*(matr2[33] + matr2[49])
  975.         nop                            ;v
  976.         lea        esi,[eax+eax]        ;u (2a/1) Q*matr2[33]
  977.         lea        edi,[ebx*4+ebx]        ;v (2b/1) R*matr2[49]
  978.         shl        edi,3                ;u (2b/2)
  979.         nop                            ;v
  980.         shl        ebp,09H                ;u (2c/2)
  981.         lea        esi,[esi*8+eax]        ;v (2a/2)
  982.         nop                            ;u
  983.         lea        ebx,[edi*4+ebx]        ;v (2b/3)
  984.         lea        esi,[esi*4+eax]        ;u (2a/3)
  985.         nop                            ;v
  986.         shl        edx,05H                ;u (2c/3)
  987.         lea        ebx,[ebx*8+edi]        ;v (2b/4)
  988.         lea        esi,[esi*4+eax]        ;u (2a/4)
  989.         add        edx,ebp                ;v (2c/4)
  990.  
  991.         lea        ebx,[ebx*4+edi]        ;u (2b/5)
  992.         nop                            ;v
  993.         lea        eax,[esi*8+eax]        ;u (2a/5)
  994. #endif
  995.         sub        ebx,edx                ;v (2) R*matr2[49]-tmp
  996.         add        edx,eax                ;u (2) Q*matr2[33]+tmp
  997.         xor        edx,-1                ;u (2)
  998.         mov        matr2[55*4],ebx        ;v (2)
  999.         inc        edx                    ;u (2)
  1000.         mov        eax,matr2[34*4]        ;v (1) [2,5]
  1001.         mov        matr2[39*4],edx        ;u (2)
  1002.         mov        ebx,matr2[37*4]        ;v (2) [2,5]
  1003.  
  1004.         ;--- 2,5
  1005.  
  1006.         mov        ecx,matr2[50*4]        ;u (1)
  1007.         mov        edx,matr2[53*4]        ;v (2)
  1008.  
  1009.         add        ecx,eax                ;u (1)
  1010.         add        edx,ebx                ;v (2)
  1011.  
  1012.         mov        eax,matr2[34*4]        ;u (1)
  1013.         mov        ebx,matr2[50*4]        ;v (1)
  1014.  
  1015. #ifdef FAST_IMUL
  1016.         imul    ebx,7568
  1017.         imul    eax,3135
  1018.         imul    ecx,2217
  1019. #else
  1020.         lea        edi,[ebx*4+ebx]        ;u (1b/1) C4R*matr2[50]
  1021.         lea        esi,[eax*2+eax]        ;v (1a/1) C4Q*matr2[34]
  1022.         lea        ebp,[ecx+ecx]        ;u (1c/1) ecx = C4C6*(matr2[32] + matr2[48])
  1023.         nop
  1024.         lea        ebx,[ebx*8+edi]        ;u (1b/2)
  1025.         lea        eax,[esi*8]            ;v (1a/2)
  1026.         lea        ebp,[ebp*8+ecx]        ;u (1c/2)
  1027.         nop
  1028.         lea        ebx,[ebx*8+ebx]        ;u (1b/3)
  1029.         lea        eax,[eax*8+esi]        ;v (1a/3)
  1030.         lea        ebp,[ebp*4+ecx]        ;u (1c/3)
  1031.         nop
  1032.         lea        ebx,[ebx*4+edi]        ;u (1b/4)
  1033.         lea        eax,[eax*4+esi]        ;v (1a/4)
  1034.         lea        ebp,[ebp*4+ecx]        ;u (1c/4)
  1035.         nop
  1036.         shl        ebx,4                ;u (1b/5)
  1037.         lea        eax,[eax*4+esi]        ;v (1a/5)
  1038.         lea        ecx,[ebp*8+ecx]        ;u (1c/5)
  1039. #endif
  1040.         sub        ebx,ecx                ;u (1) C4R*matr2[50]-tmp
  1041.         add        ecx,eax                ;v (1) C4Q*matr2[34]+tmp
  1042.         xor        ecx,-1                ;u (1)
  1043.         mov        matr2[50*4],ebx        ;v (1)
  1044.  
  1045.         mov        eax,matr2[37*4]        ;u (2)
  1046.         inc        ecx                    ;v (1)
  1047.         mov        ebx,matr2[53*4]        ;u (2)
  1048.         mov        matr2[34*4],ecx        ;v (1)
  1049.  
  1050. #ifdef FAST_IMUL
  1051.         imul    ebx,7568
  1052.         imul    eax,3135
  1053.         imul    edx,2217
  1054. #else
  1055.         lea        ebp,[edx+edx]        ;v (2c/1) edx = C4C6*(matr2[33] + matr2[49])
  1056.         nop                            ;u
  1057.         lea        esi,[eax*2+eax]        ;v (2a/1) C4Q*matr2[37]
  1058.         lea        edi,[ebx*4+ebx]        ;u (2b/1) C4R*matr2[53]
  1059.  
  1060.         lea        ebp,[ebp*8+edx]        ;u (2c/2)
  1061.         nop
  1062.         lea        ebx,[ebx*8+edi]        ;u (2b/2)
  1063.         lea        eax,[esi*8]            ;v (2a/2)
  1064.         lea        ebp,[ebp*4+edx]        ;u (2c/3)
  1065.         nop
  1066.         lea        ebx,[ebx*8+ebx]        ;u (2b/3)
  1067.         lea        eax,[eax*8+esi]        ;(2a/3)
  1068.         lea        ebp,[ebp*4+edx]        ;u (2c/4)
  1069.         nop
  1070.         lea        ebx,[ebx*4+edi]        ;u (2b/4)
  1071.         lea        eax,[eax*4+esi]        ;(2a/4)
  1072.         lea        edx,[ebp*8+edx]        ;u (2c/5)
  1073.         nop
  1074.         shl        ebx,4                ;u (2b/5)
  1075.         lea        eax,[eax*4+esi]        ;(2a/5)
  1076. #endif
  1077.         add        eax,edx                ;u (2) C4Q*matr2[37]+tmp
  1078.         sub        ebx,edx                ;v (2) C4R*matr2[53]-tmp
  1079.         xor        eax,-1                ;u (2)
  1080.         mov        matr2[53*4],ebx        ;v (2)
  1081.         inc        eax                    ;u (2)
  1082.         mov        matr2[37*4],eax        ;u (2)
  1083.  
  1084.         ;--- done
  1085.         
  1086.     // compute A1 x A2 x A3 (horizontal/vertical algorithm):
  1087.  
  1088.         mov        ebp,7*4
  1089.  
  1090.     idct_A1A2_loop:
  1091.         mov        edi,[matr2+ebp*8+0*4]        ;edi = [0]
  1092.         mov        eax,[matr2+ebp*8+1*4]        ;eax = [1]
  1093.  
  1094.         mov        ebx,[matr2+ebp*8+3*4]        ;ebx = [3]
  1095.         mov        ecx,[matr2+ebp*8+2*4]        ;ecx = [2]
  1096.  
  1097.         lea        esi,[edi+eax]                ;esi = [0]+[1]
  1098.         sub        edi,eax                        ;edi = [0]-[1]
  1099.  
  1100.         mov        edx,[matr2+ebp*8+7*4]        ;edx = [7]
  1101.         sub        ecx,ebx                        ;ecx = [2]-[3]
  1102.  
  1103.         lea        eax,[esi+ebx]                ;eax = [0]+[1]+[3]
  1104.         mov        ebx,edi                        ;ebx = [0]-[1]
  1105.  
  1106.         add        ebx,ecx                        ;ebx = [0]-[1]+[2]-[3]
  1107.         sub        edi,ecx                        ;edi = [0]-[1]-[2]+[3]
  1108.  
  1109.         lea        ecx,[eax+edx]                ;ecx = [0]+[1]+[3]+[7]
  1110.         sub        eax,edx                        ;eax = [0]+[1]+[3]-[7]
  1111.  
  1112.         sub        edx,[matr2+ebp*8+6*4]        ;edx = -[6]+[7]
  1113.         sub        esi,[matr2+ebp*8+3*4]        ;esi = [0]+[1]-[3]
  1114.  
  1115.         mov        [matr1+ebp+0*32],ecx
  1116.         mov        ecx,[matr2+ebp*8+5*4]        ;ecx = [5]
  1117.  
  1118.         mov        [matr1+ebp+7*32],eax
  1119.         lea        eax,[ebx+edx]                ;eax = [0]-[1]+[2]-[3]-[6]+[7]
  1120.  
  1121.         sub        ebx,edx                        ;ebx = [0]-[1]+[2]-[3]+[6]-[7]
  1122.         mov        [matr1+ebp+6*32],eax
  1123.  
  1124.         add        edx,ecx                        ;edx = [5]-[6]+[7]
  1125.         mov        ecx,[matr2+ebp*8+4*4]        ;ecx = [4]
  1126.  
  1127.         mov        [matr1+ebp+1*32],ebx
  1128.         add        ecx,edx                        ;ecx = [4]+[5]-[6]+[7]
  1129.  
  1130.         lea        ebx,[edi+edx]                ;ebx = [0]-[1]-[2]+[3]+[5]-[6]+[7]
  1131.         sub        edi,edx                        ;edi = [0]-[1]-[2]+[3]-[5]+[6]-[7]
  1132.  
  1133.         lea        eax,[esi+ecx]                ;eax = [0]+[1]-[3]+[4]+[5]-[6]+[7]
  1134.         mov        [matr1+ebp+2*32],ebx
  1135.  
  1136.         mov        [matr1+ebp+5*32],edi
  1137.         sub        esi,ecx                        ;esi = [0]+[1]-[3]-[4]-[5]+[6]-[7]
  1138.  
  1139.         mov        [matr1+ebp+4*32],eax
  1140.         mov        [matr1+ebp+3*32],esi
  1141.  
  1142.         sub        ebp,4
  1143.         jnc        idct_A1A2_loop
  1144.  
  1145.         ;******************************************
  1146.  
  1147.         mov        ebp,-8*32
  1148.         mov        esi,[esp+4+16]
  1149.         mov        eax,esi
  1150.  
  1151.         test    dword ptr [esp+12+16],-1
  1152.         jnz        idct_final_loop_intra
  1153.  
  1154.     idct_final_loop_inter:
  1155.         mov        eax,[matr1+ebp+8*32+0*4]            ;eax = [0]
  1156.         mov        ebx,[matr1+ebp+8*32+1*4]            ;ebx = [1]
  1157.  
  1158.         mov        edi,[matr1+ebp+8*32+3*4]            ;edi = [3]
  1159.         add        eax,ebx            ;eax = [0] + [1]
  1160.  
  1161.         add        eax,edi            ;eax = [0] + [1] + [3]
  1162.         mov        edx,[matr1+ebp+8*32+7*4]            ;edx = [7]
  1163.  
  1164.         mov        ebx,eax            ;ebx = [0] + [1] + [3]
  1165.         add        eax,edx            ;eax = [0] + [1] + [3] + [7]
  1166.  
  1167.         sub        ebx,edx            ;ebx = [0] + [1] + [3] - [7]
  1168.  
  1169.         mov        edx,eax
  1170.         xor        ecx,ecx
  1171.  
  1172.         sar        edx,22
  1173.         mov        cl,[esi+0]
  1174.  
  1175.         mov        dl,[YUV_clip_table+edx+ecx+256]
  1176.         mov        cl,[esi+7]
  1177.  
  1178.         mov        [esi+0],dl
  1179.         mov        edx,ebx
  1180.  
  1181.         sar        edx,22
  1182.  
  1183.         mov        dl,[YUV_clip_table+edx+ecx+256]
  1184.  
  1185.         mov        [esi+7],dl
  1186.         mov        edi,[matr1+ebp+8*32+1*4]            ;edi = [1]
  1187.  
  1188.         mov        edx,[matr1+ebp+8*32+3*4]            ;edx = [3]
  1189.         mov        ecx,[matr1+ebp+8*32+2*4]            ;ecx = [2]
  1190.  
  1191.         add        edi,edx            ;edi = [1] + [3]
  1192.         mov        edx,[matr1+ebp+8*32+6*4]            ;edx = [6]
  1193.  
  1194.         add        edi,edi            ;edi = 2[1] + 2[3]
  1195.         sub        eax,edx            ;eax = [0] + [1] + [3] - [6] + [7]
  1196.  
  1197.         sub        edi,ecx            ;edi = 2[1] - [2] + 2[3]
  1198.         add        ebx,edx            ;ebx = [0] + [1] + [3] + [6] - [7]
  1199.  
  1200.         sub        eax,edi            ;eax = [0] - [1] + [2] - [3] - [6] + [7]
  1201.         sub        ebx,edi            ;ebx = [0] - [1] + [2] - [3] + [6] - [7]
  1202.  
  1203.         mov        ecx,eax
  1204.         xor        edx,edx
  1205.  
  1206.         sar        ecx,22
  1207.         mov        dl,[esi+6]
  1208.  
  1209.         mov        cl,[YUV_clip_table+ecx+edx+256]
  1210.         mov        dl,[esi+1]
  1211.  
  1212.         mov        [esi+6],cl
  1213.         mov        ecx,ebx
  1214.  
  1215.         sar        ecx,22
  1216.  
  1217.         mov        cl,[YUV_clip_table+ecx+edx+256]
  1218.  
  1219.         mov        [esi+1],cl
  1220.         mov        edx,[matr1+ebp+8*32+2*4]
  1221.  
  1222.         sub        edx,[matr1+ebp+8*32+3*4]            ;edx = [2] - [3]
  1223.         mov        ecx,[matr1+ebp+8*32+5*4]
  1224.  
  1225.         add        edx,edx            ;edx = 2[2] - 2[3]
  1226.         add        eax,ecx            ;eax = [0] - [1] + [2] - [3] + [5] - [6] + [7]
  1227.  
  1228.         sub        ebx,ecx            ;ebx = [0] - [1] + [2] - [3] - [5] + [6] - [7]
  1229.         sub        eax,edx            ;eax = [0] - [1] - [2] + [3] + [5] - [6] + [7]
  1230.  
  1231.         sub        ebx,edx            ;ebx = [0] - [1] - [2] + [3] - [5] + [6] - [7]
  1232.         mov        edx,eax
  1233.  
  1234.         sar        edx,22
  1235.         xor        ecx,ecx
  1236.  
  1237.         mov        cl,[esi+2]
  1238.  
  1239.         mov        dl,[YUV_clip_table+ecx+edx+256]
  1240.         mov        cl,[esi+5]
  1241.  
  1242.         mov        [esi+2],dl
  1243.         mov        edx,ebx
  1244.  
  1245.         sar        edx,22
  1246.  
  1247.         mov        dl,[YUV_clip_table+ecx+edx+256]
  1248.  
  1249.         mov        [esi+5],dl
  1250.         mov        ecx,[matr1+ebp+8*32+1*4]
  1251.  
  1252.         shl        ecx,2            ;ecx = 4[1]
  1253.         mov        edx,[matr1+ebp+8*32+4*4]
  1254.  
  1255.         add        eax,edx            ;eax = [0] - [1] - [2] + [3] + [4] + [5] - [6] + [7]
  1256.         sub        edi,ecx            ;edi = -2[1] - [2] + 2[3]
  1257.  
  1258.         sub        ebx,edx            ;ebx = [0] - [1] - [2] + [3] - [4] - [5] + [6] - [7]
  1259.         sub        eax,edi            ;eax = [0] + [1]       - [3] + [4] + [5] - [6] + [7]
  1260.  
  1261.         sar        eax,22
  1262.         sub        ebx,edi            ;ebx = [0] + [1]       - [3] - [4] - [5] + [6] - [7]
  1263.  
  1264.         sar        ebx,22
  1265.         xor        ecx,ecx
  1266.  
  1267.         xor        edx,edx
  1268.         mov        edi,[esp+8+16]
  1269.  
  1270.         mov        cl,[esi+4]
  1271.         mov        dl,[esi+3]
  1272.  
  1273.         mov        al,[YUV_clip_table+eax+ecx+256]
  1274.         mov        bl,[YUV_clip_table+ebx+edx+256]
  1275.  
  1276.         mov        [esi+4],al
  1277.         mov        [esi+3],bl
  1278.  
  1279.         add        esi,edi
  1280.  
  1281.         add        ebp,32
  1282.         jnz        idct_final_loop_inter
  1283.  
  1284. finish:
  1285. #ifdef PROFILE
  1286.         rdtsc
  1287.         mov        ebx,eax
  1288.         sub        eax,profile_start
  1289.         inc        profile_slows
  1290.         add        profile_slow_cycles,eax
  1291.         sub        ebx,profile_last
  1292.         cmp        ebx,CYCLES_PER_SECOND
  1293.         jb        notsec
  1294.         call    profile_update
  1295. notsec:
  1296. #endif
  1297.         pop        ebx
  1298.         pop        ebp
  1299.         pop        edi
  1300.         pop        esi
  1301.         ret
  1302.  
  1303.         align    16
  1304. idct_final_loop_intra:
  1305.         mov        cl,[eax]
  1306.  
  1307.         mov        eax,[matr1+ebp+8*32+4*0]    ;eax = [0]
  1308.         mov        ebx,[matr1+ebp+8*32+4*1]    ;ebx = [1]
  1309.  
  1310.         mov        esi,[matr1+ebp+8*32+4*3]    ;esi = [3]
  1311.         mov        edx,[matr1+ebp+8*32+4*2]    ;edx = [2]
  1312.  
  1313.         lea        ecx,[eax+ebx]                ;ecx = [0]+[1]
  1314.         sub        eax,ebx                        ;eax = [0]-[1]
  1315.  
  1316.         add        ecx,esi                        ;ecx = [0]+[1]+[3]
  1317.         mov        ebx,[matr1+ebp+8*32+4*7]    ;ebx = [7]
  1318.  
  1319.         sub        esi,edx                        ;esi = [3]-[2]
  1320.         mov        edi,[matr1+ebp+8*32+4*6]    ;edi = [6]
  1321.  
  1322.         sub        eax,esi                        ;eax = [0]-[1]+[2]-[3]
  1323.         lea        edx,[ecx+ebx]                ;edx = [0]+[1]+[3]+[7]
  1324.  
  1325.         sub        ecx,ebx                        ;ecx = [0]+[1]+[3]-[7]
  1326.         sub        ebx,edi                        ;ebx = -[6]+[7]
  1327.  
  1328.         sar        edx,22                        ;edx = <0>
  1329.         mov        edi,[esp+4+16]
  1330.  
  1331.         sar        ecx,22                        ;ecx = <7>
  1332.         mov        dl,[YUV_clip_table+edx+256]    ;dl = FINAL[0]
  1333.  
  1334.         mov        dh,[YUV_clip_table+ecx+256]    ;dh = FINAL[7]
  1335.         lea        ecx,[eax+ebx]                ;ecx = [0]-[1]+[2]-[3]-[6]+[7]
  1336.  
  1337.         lea        esi,[esi*2+eax]                ;esi = [0]-[1]-[2]+[3]
  1338.         mov        [edi+0],dl
  1339.  
  1340.         mov        [edi+7],dh
  1341.         sub        eax,ebx                        ;eax = [0]-[1]+[2]-[3]+[6]-[7]
  1342.  
  1343.         sar        eax,22
  1344.         mov        edx,[matr1+ebp+8*32+4*5]    ;edx = [5]
  1345.  
  1346.         sar        ecx,22
  1347.         add        ebx,edx                        ;ebx = [5]-[6]+[7]
  1348.  
  1349.         mov        al,[YUV_clip_table+eax+256]    ;al = FINAL[1]
  1350.         mov        edx,[matr1+ebp+8*32+4*0]    ;edx = [0]
  1351.  
  1352.         mov        ah,[YUV_clip_table+ecx+256]    ;ah = FINAL[6]
  1353.         lea        ecx,[esi+ebx]                ;ecx = [0]-[1]-[2]+[3]+[5]-[6]+[7]
  1354.  
  1355.         sub        esi,ebx                        ;esi = [0]-[1]-[2]+[3]-[5]+[6]-[7]
  1356.         mov        [edi+1],al
  1357.  
  1358.         mov        [edi+6],ah
  1359.         mov        eax,[matr1+ebp+8*32+4*1]    ;eax = [1]
  1360.  
  1361.         sar        ecx,22
  1362.         add        edx,eax                        ;edx = [0]+[1]
  1363.  
  1364.         sar        esi,22
  1365.         mov        eax,[matr1+ebp+8*32+4*3]    ;eax = [3]
  1366.  
  1367.         sub        edx,eax                        ;edx = [0]+[1]-[3]
  1368.         mov        eax,[matr1+ebp+8*32+4*4]    ;eax = [4]
  1369.  
  1370.         mov        cl,[YUV_clip_table+ecx+256]    ;cl = FINAL[2]
  1371.         add        ebx,eax                        ;ebx = [4]+[5]-[6]+[7]
  1372.  
  1373.         mov        ch,[YUV_clip_table+esi+256]    ;ch = FINAL[5]
  1374.         mov        eax,[esp+8+16]
  1375.  
  1376.         add        eax,edi
  1377.         lea        esi,[edx+ebx]                ;esi = [0]+[1]-[3]+[4]+[5]-[6]+[7]
  1378.  
  1379.         sar        esi,22
  1380.         sub        edx,ebx                        ;edx = [0]+[1]-[3]-[4]-[5]+[6]-[7]
  1381.  
  1382.         sar        edx,22
  1383.         mov        [edi+2],cl
  1384.  
  1385.         mov        [edi+5],ch
  1386.         mov        bl,[YUV_clip_table+esi+256]
  1387.  
  1388.         mov        cl,[YUV_clip_table+edx+256]
  1389.         mov        [edi+4],bl
  1390.  
  1391.         mov        [edi+3],cl
  1392.         add        ebp,32
  1393.  
  1394.         mov        [esp+4+16],eax
  1395.         jnz        idct_final_loop_intra
  1396.  
  1397.         jmp        short finish
  1398.     }
  1399.  
  1400.  
  1401. IDCT_vert_0137:
        // IDCT_vert_0137: transforms one row of eight 32-bit values.
        //
        // In:    ebp = byte offset of the row (the "p" of the pseudo-code
        //        comments below, times 4).  It is not set here, so the caller
        //        must load it before transferring control to this label.
        // Read:  matr1[p+0 .. p+7]
        // Write: matr2[p+0 .. p+7], in the permuted order spelled out by the
        //        pseudo-code comments (e.g. matr1[p+4]<<11 lands in matr2[p+1]).
        // Clobbers eax, ebx, ecx, edx; the non-FAST_IMUL paths also use
        // esi and edi as scratch.  Leaves through a plain "ret".
        //
        // NOTE(review): the name suggests this serves rows 0, 1, 3 and 7;
        // the call sites are outside this excerpt -- confirm.
  1402.     __asm {
  1403.
  1404. //    tmp4 = matr1[3+p]-matr1[5+p];
  1405. //    tmp6 = matr1[1+p]-matr1[7+p];
  1406. //    tmp = C6 * (tmp6-tmp4);
  1407. //    matr2[p+4] =  Q*tmp4-tmp;
  1408. //    matr2[p+6] =  R*tmp6-tmp;
  1409.
  1410.         mov        eax,matr1[ebp+3*4]            ;eax = matr1[3+p]
  1411.         mov        ebx,matr1[ebp+1*4]            ;ebx = matr1[1+p]
  1412.         sub        eax,matr1[ebp+5*4]            ;eax = tmp4 = matr1[3+p] - matr1[5+p]
  1413.         sub        ebx,matr1[ebp+7*4]            ;ebx = tmp6 = matr1[1+p] - matr1[7+p]
  1414.
        // Both paths below leave ecx = 1568*(tmp6-tmp4), eax = 2217*tmp4 and
        // ebx = 5352*tmp6.  The #else path builds the same products out of
        // lea/shl steps instead of imul; the running multipliers are
        //   (a) 2 -> 17 -> 69 -> 277 -> 2217
        //   (b) 5 -> 40 -> 161 -> 1328 -> 5352
        //   (c) 3*512 + 32 = 1568
        // The ";(c/1)" remark below names eax, but that partial product is
        // built in edx and summed into ecx at (c/4).
  1415. #ifdef FAST_IMUL
  1416.         mov        ecx,ebx
  1417.         sub        ecx,eax                        ;ecx = C6*(tmp6-tmp4)
  1418.         imul    ecx,1567+1
  1419.         imul    eax,2217
  1420.         imul    ebx,5352
  1421. #else
  1422.         lea        esi,[eax+eax]                ;(a/1) Q*tmp4
  1423.         lea        edi,[ebx*4+ebx]                ;(b/1) R*tmp6
  1424.         shl        edi,3                        ;(b/2)
  1425.         mov        ecx,ebx
  1426.
  1427.         lea        esi,[esi*8+eax]                ;(a/2)
  1428.         sub        ecx,eax                        ;ecx = tmp6-tmp4
  1429.         nop
  1430.         lea        ebx,[edi*4+ebx]                ;(b/3)
  1431.
  1432.         lea        esi,[esi*4+eax]                ;(a/3)
  1433.         lea        edx,[ecx+ecx*2]                ;(c/1) eax = C6*(tmp6-tmp4)
  1434.         shl        edx,09H                        ;(c/2)
  1435.         lea        ebx,[ebx*8+edi]                ;(b/4)
  1436.
  1437.         nop                                    ;
  1438.         lea        esi,[esi*4+eax]                ;(a/4)
  1439.         shl        ecx,05H                        ;(c/3)
  1440.         lea        ebx,[ebx*4+edi]                ;(b/5)
  1441.
  1442.         lea        eax,[esi*8+eax]                ;(a/5)
  1443.         add        ecx,edx                        ;(c/4)
  1444. #endif
  1445.
  1446.         sub        eax,ecx                        ;eax = Q*tmp4 - tmp;
  1447.         sub        ebx,ecx                        ;ebx = R*tmp6 - tmp;
  1448.
  1449.         mov        matr2[ebp+4*4],eax            ;matr2[p+4] = Q*tmp4 - tmp;
  1450.         mov        matr2[ebp+6*4],ebx            ;matr2[p+6] = R*tmp6 - tmp;
  1451.
  1452. //    co17 = matr1[p+1] + matr1[p+7];
  1453. //    co35 = matr1[p+3] + matr1[p+5];
  1454. //    matr2[p+5] =  (co17-co35)*C4;
  1455. //    matr2[p+7] =  (co17+co35)<<11;
  1456.
  1457. //    matr2[p+2] =  (matr1[p+2]-matr1[p+6])*C4;
  1458. //    matr2[p+3] =  (matr1[p+2]+matr1[p+6]) << 11;
  1459.
  1460.         mov        eax,matr1[ebp+1*4]            ;eax = matr1[p+1]
  1461.         mov        ebx,matr1[ebp+3*4]            ;ebx = matr1[p+3]
  1462.         add        eax,matr1[ebp+7*4]            ;eax = co17 = matr1[p+1] + matr1[p+7]
  1463.         add        ebx,matr1[ebp+5*4]            ;ebx = co35 = matr1[p+3] + matr1[p+5]
  1464.
  1465.         mov        ecx,ebx                        ;ecx = co35
  1466.         add        ebx,eax                        ;ebx = co17 + co35
  1467.         shl        ebx,11                        ;ebx = (co17 + co35)<<11
  1468.         sub        eax,ecx                        ;eax = co17 - co35
  1469.         mov        matr2[ebp+7*4],ebx            ;matr2[p+7] = (co17 + co35)<<11
  1470.
  1471.         mov        ebx,matr1[ebp+2*4]            ;ebx = matr1[p+2]
  1472.         mov        edx,matr1[ebp+6*4]            ;edx = matr1[p+6]
  1473.         mov        ecx,edx                        ;ecx = matr1[p+6]
  1474.         add        edx,ebx                        ;edx = matr1[p+2] + matr1[p+6]
  1475.         shl        edx,11                        ;edx = (matr1[p+2] + matr1[p+6])<<11
  1476.         sub        ebx,ecx                        ;ebx = matr1[p+2] - matr1[p+6]
  1477.         mov        matr2[ebp+3*4],edx            ;matr2[p+3] = (matr1[p+2] + matr1[p+6])<<11
  1478.
  1479.         ;multiply eax, ebx by C4
  1480.
        // C4 is the integer 2896 here.  The #else path reaches it as 181*16:
        // multipliers 9 -> 45 -> 181, with the factor 16 applied by "shl ..,4"
        // (up front for ebx, at the very end for eax).
  1481. #ifdef FAST_IMUL
  1482.         imul    eax,2896
  1483.         imul    ebx,2896
  1484. #else
  1485.         shl        ebx,4
  1486.         lea        esi,[eax*8+eax]
  1487.
  1488.         lea        esi,[esi*4+esi]
  1489.         lea        edi,[ebx*8+ebx]
  1490.
  1491.         lea        eax,[esi*4+eax]
  1492.         lea        edi,[edi*4+edi]
  1493.
  1494.         shl        eax,4
  1495.         lea        ebx,[edi*4+ebx]
  1496. #endif
  1497.
  1498. //    matr2[p+0] =  matr1[p+0] << 11;
  1499. //    matr2[p+1] =  matr1[p+4] << 11;
  1500.
        // The two C4 products (eax -> matr2[p+5], ebx -> matr2[p+2]) are
        // stored interleaved with the <<11 shifts of matr1[p+0] and matr1[p+4].
  1501.         mov        ecx,matr1[ebp+0*4]
  1502.         mov        edx,matr1[ebp+4*4]
  1503.
  1504.         shl        ecx,11
  1505.         mov        matr2[ebp+5*4],eax
  1506.         shl        edx,11
  1507.         mov        matr2[ebp+2*4],ebx
  1508.
  1509.         mov        matr2[ebp+0*4],ecx
  1510.         mov        matr2[ebp+1*4],edx
  1511.
  1512.         ret
  1513.     };
  1514.  
  1515.  
  1516. IDCT_vert_25:
        // IDCT_vert_25: runs the row routine IDCT_vert_25_dorow twice, first
        // with ebp = 2*8*4 and then with ebp = 5*8*4 (byte offsets of rows 2
        // and 5 if a row is eight dwords).  The first run is a real "call";
        // the second is reached by falling through into IDCT_vert_25_dorow,
        // so its "ret" returns to whoever called IDCT_vert_25.
        //
        // Per row: reads matr1[p+0 .. p+7] and writes matr2[p+0 .. p+7],
        // where p*4 is the value in ebp.
        // Clobbers eax, ebx, ecx, edx, esi, edi and ebp.
  1517.     __asm {
  1518.         mov        ebp,2*8*4
  1519.         call    IDCT_vert_25_dorow
  1520.         mov        ebp,5*8*4
  1521.
  1522. IDCT_vert_25_dorow:
  1523. //    tmp4 = matr1[p+3]-matr1[p+5];
  1524. //    tmp6 = matr1[p+1]-matr1[p+7];
  1525.         mov        eax, matr1[ebp+3*4]
  1526.         mov        ebx, matr1[ebp+1*4]
  1527.         sub        eax, matr1[ebp+5*4]        ;eax = tmp4
  1528.         sub        ebx, matr1[ebp+7*4]        ;ebx = tmp6
  1529.
  1530.
  1531. //    tmp = C4C6 * (tmp6-tmp4);            c
  1532. //    matr2[p+4] = C4Q*tmp4-tmp;            a
  1533. //    matr2[p+6] = C4R*tmp6-tmp;            b
  1534.
        // Both paths below leave ecx = 2217*(tmp6-tmp4), eax = 3135*tmp4 and
        // ebx = 7568*tmp6.  Running multipliers of the lea/shl chains in the
        // #else path:
        //   a: 3 -> 24 -> 195 -> 783 -> 3135
        //   b: 5 -> 13 -> 117 -> 473 -> 7568  (473 * 16 via "shl ebx,4")
        //   c: 2 -> 17 -> 69 -> 277 -> 2217
  1535. #ifdef FAST_IMUL
  1536.         mov        ecx,ebx
  1537.         sub        ecx,eax                    ;ecx = tmp6 - tmp4
  1538.         imul    ecx,2217                ;ecx = C4C6*(tmp6 - tmp4)
  1539.         imul    eax,3135
  1540.         imul    ebx,7568
  1541. #else
  1542.         mov        ecx,ebx
  1543.         nop
  1544.         lea        esi,[eax*2+eax]                ;a/1
  1545.         lea        edi,[ebx*4+ebx]                ;b/1
  1546.
  1547.         sub        ecx,eax                    ;ecx = tmp6 - tmp4
  1548.         nop
  1549.         lea        eax,[esi*8]                    ;a/2
  1550.         lea        ebx,[ebx*8+edi]                ;b/2
  1551.
  1552.         nop
  1553.         lea        edx,[ecx+ecx]                ;c/1
  1554.         lea        eax,[eax*8+esi]                ;a/3
  1555.         lea        ebx,[ebx*8+ebx]                ;b/3
  1556.
  1557.         nop
  1558.         lea        edx,[edx*8+ecx]                ;c/2
  1559.         lea        eax,[eax*4+esi]                ;a/4
  1560.         lea        ebx,[ebx*4+edi]                ;b/4
  1561.
  1562.         shl        ebx,4                        ;b/5
  1563.         lea        edx,[edx*4+ecx]                ;c/3
  1564.
  1565.         lea        eax,[eax*4+esi]                ;a/5
  1566.         lea        edx,[edx*4+ecx]                ;c/4
  1567.         lea        ecx,[edx*8+ecx]                ;c/5
  1568. #endif
  1569.
  1570.         sub        eax,ecx
  1571.         sub        ebx,ecx
  1572.         mov        matr2[ebp+4*4],eax
  1573.         mov        matr2[ebp+6*4],ebx
  1574.
  1575. //    co17 = co1 + co7;
  1576. //    co35 = co3 + co5;
  1577. //    matr2[p+5] = (co17-co35)<<12;
  1578. //    matr2[p+7] = (co17+co35)*C4;        (a)
  1579.
  1580. //    co2=matr1[p+2];
  1581. //    co6=matr1[p+6];
  1582. //    matr2[p+2] = (co2-co6)<<12;
  1583. //    matr2[p+3] = (co2+co6)*C4;            (b)
  1584.
  1585.         mov        eax,matr1[ebp+1*4]
  1586.         mov        ebx,matr1[ebp+3*4]
  1587.         mov        ecx,matr1[ebp+2*4]
  1588.         mov        edx,matr1[ebp+6*4]
  1589.         add        eax,matr1[ebp+7*4]        ;eax = co17
  1590.         add        ebx,matr1[ebp+5*4]        ;ebx = co35
  1591.
        // esi carries co17-co35 and edi carries co2-co6 (both shifted left
        // by 12 and stored); eax = co17+co35 and ecx = co2+co6 go on to the
        // C4 multiply.
  1592.         mov        esi,eax
  1593.         add        eax,ebx                    ;eax = co17+co35
  1594.
  1595.         mov        edi,ecx
  1596.         sub        esi,ebx
  1597.
  1598.         shl        esi,12
  1599.         add        ecx,edx
  1600.         sub        edi,edx
  1601.
  1602.         shl        edi,12
  1603.         mov        matr2[ebp+5*4],esi
  1604.         mov        matr2[ebp+2*4],edi
  1605.
        // C4 is the integer 2896 = 181*16; the #else path uses multipliers
        // 9 -> 45 -> 181 plus a "shl ..,4" (first for ecx, last for eax).
  1606. #ifdef FAST_IMUL
  1607.         imul    ecx,2896
  1608.         imul    eax,2896
  1609. #else
  1610.         shl        ecx,4                    ;b/1
  1611.         lea        esi,[eax*8+eax]            ;a/1
  1612.         lea        esi,[esi*4+esi]            ;a/2
  1613.         lea        edi,[ecx*8+ecx]            ;b/2
  1614.         lea        eax,[esi*4+eax]            ;a/3
  1615.         lea        edi,[edi*4+edi]            ;b/3
  1616.         shl        eax,4                    ;a/4
  1617.         lea        ecx,[edi*4+ecx]            ;b/4
  1618. #endif
  1619.
  1620.         mov        matr2[ebp+7*4],eax
  1621.         mov        matr2[ebp+3*4],ecx
  1622.
  1623. //    matr2[p+0] = C4*matr1[p  ];            (a)
  1624. //    matr2[p+1] = C4*matr1[p+4];            (b)
  1625.
  1626.         mov        eax,matr1[ebp+0*4]
  1627.         mov        ebx,matr1[ebp+4*4]
  1628.
        // Same 2896 multiply as above, this time applied to eax and ebx.
  1629. #ifdef FAST_IMUL
  1630.         imul    ebx,2896
  1631.         imul    eax,2896
  1632. #else
  1633.         shl        ebx,4                    ;b/1
  1634.         lea        esi,[eax*8+eax]            ;a/1
  1635.         lea        esi,[esi*4+esi]            ;a/2
  1636.         lea        edi,[ebx*8+ebx]            ;b/2
  1637.         lea        eax,[esi*4+eax]            ;a/3
  1638.         lea        edi,[edi*4+edi]            ;b/3
  1639.         shl        eax,4                    ;a/4
  1640.         lea        ebx,[edi*4+ebx]            ;b/4
  1641. #endif
  1642.
  1643.         mov        matr2[ebp+0*4],eax
  1644.         mov        matr2[ebp+1*4],ebx
  1645.
  1646.         ret
  1647.     }
  1648.  
  1649.  
  1650. IDCT_vert_46:
        // IDCT_vert_46: runs the row routine IDCT_vert_46_dorow twice, first
        // with ebp = 4*8*4 (via "call") and then with ebp = 6*8*4 (by falling
        // through), so the routine's "ret" on the second pass returns to the
        // caller of IDCT_vert_46.
        //
        // The row routine uses only moves, adds and subtracts (no scaling).
        // With p*4 = ebp:
        //   matr2[p+0] = matr1[p+0]
        //   matr2[p+1] = matr1[p+4]
        //   matr2[p+2] = matr1[p+2] - matr1[p+6]
        //   matr2[p+3] = matr1[p+2] + matr1[p+6]
        //   matr2[p+5] = (matr1[p+1]+matr1[p+7]) - (matr1[p+3]+matr1[p+5])
        //   matr2[p+7] = (matr1[p+1]+matr1[p+7]) + (matr1[p+3]+matr1[p+5])
        // matr2[p+4] and matr2[p+6] are not written by this routine.
        // NOTE(review): presumably those two slots are produced elsewhere
        // for these rows -- confirm against the rest of the function.
        //
        // Clobbers eax, ebx, ecx, edx, esi, edi and ebp.
  1651.     __asm {
  1652.         mov        ebp,4*8*4
  1653.         call    IDCT_vert_46_dorow
  1654.         mov        ebp,6*8*4
  1655. IDCT_vert_46_dorow:
  1656.         mov        eax,matr1[ebp+0*4]
  1657.         mov        ebx,matr1[ebp+4*4]
  1658.         mov        matr2[ebp+0*4],eax
  1659.         mov        matr2[ebp+1*4],ebx
  1660.
  1661.         mov        esi,matr1[ebp+2*4]
  1662.         mov        edi,matr1[ebp+6*4]
  1663.
  1664.         mov        eax,matr1[ebp+1*4]
  1665.         mov        ebx,matr1[ebp+3*4]
  1666.         add        eax,matr1[ebp+7*4]
  1667.         add        ebx,matr1[ebp+5*4]
  1668.
        // lea forms each sum in a fresh register so the following sub can
        // turn the original register into the matching difference.
  1669.         lea        edx,[esi+edi]
  1670.         sub        esi,edi
  1671.         lea        ecx,[eax+ebx]
  1672.         sub        eax,ebx
  1673.
  1674.         mov        matr2[ebp+2*4],esi
  1675.         mov        matr2[ebp+3*4],edx
  1676.         mov        matr2[ebp+5*4],eax
  1677.         mov        matr2[ebp+7*4],ecx
  1678.
  1679.         ret
  1680.     };
  1681. }
  1682.